V17 - Change sample to dataset; change project to cohort
V16 - use namer on chunks
V15 - change correlation plots to small dots
V14 - combine plots
V13 - change gene count threshold from categporical to numeric
V12 - exclude datasets with fewer than 100 measured genes and use pre-generated survey_data
V11 - color consistency
V10 - calculate non genic and all exonic counts for correlations analysis
V9 - add gene count; import readdist.txt for more nuanced comparison of cor with gene count
reference range for unmapped reads is calculated based on unmapped reads/total
reference range for duplicate reads is calculated based on duplicate reads/mapped reads where mapped reads = total mapped and multi-mapped (<20x) reads)
reference range for non-exonic reads is calculated based on non-exonic/non duplicate reads) where non duplicate reads = exonic and non-exonic non dupe reads)
The composition fractions are not dependent on higher level ones
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(ggthemes) # for theme_linedraw()
library(ggpubr) # for stat_cor
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(cowplot) # for ggdraw and draw_label
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(RColorBrewer)
inputs_dir <- "inputs"
data_file <- "survey_data.2020-06-18_10.00.49.txt"
min_genes_gt_0 <- 100
is_outlier <- function(x) {
(x > quantile(x, 0.75) + 1.5*IQR(x)) |
(x < quantile(x, 0.25) - 1.5*IQR(x))
}
counts_and_meta <- read_tsv(file.path(inputs_dir, data_file)) %>%
group_by(cohort) %>%
mutate(n_in_cohort = n())
## Parsed with column specification:
## cols(
## .default = col_double(),
## dataset_id = col_character(),
## cohort = col_character(),
## source_accession = col_character(),
## original_sample_id = col_character(),
## disease = col_character(),
## sample_prefix = col_character(),
## pedaya = col_character(),
## gender = col_character()
## )
## See spec(...) for full column specifications.
# brewer.pal(8, "Paired")
these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C",
"#FF7F00", "#FF7F00")
names(these_colors) <- c("not assigned", "Total reads", "Non-exonic reads", "Exonic reads", "Duplicate reads", "Non-duplicate reads", "Unmapped reads", "Mapped reads")
these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")
nrow(counts_and_meta)
## [1] 2178
counts_and_meta %>% tabyl(disease)
## disease n percent
## acute lymphoblastic leukemia 680 0.3122130395
## acute megakaryoblastic leukemia 103 0.0472910927
## acute myeloid leukemia 221 0.1014692378
## acute undifferentiated leukemia 1 0.0004591368
## adrenocortical adenoma 1 0.0004591368
## adrenocortical cancer 2 0.0009182736
## adrenocortical carcinoma 19 0.0087235996
## alveolar rhabdomyosarcoma 40 0.0183654729
## cholangiocarcinoma 1 0.0004591368
## choroid plexus carcinoma 25 0.0114784206
## desmoplastic small round cell tumor 4 0.0018365473
## dysembryoplastic neuroepithelial tumor 13 0.0059687787
## embryonal rhabdomyosarcoma 42 0.0192837466
## embryonal tumor with multilayered rosettes 1 0.0004591368
## ependymoma 98 0.0449954086
## epithelioid sarcoma 1 0.0004591368
## Ewing sarcoma 70 0.0321395776
## fibrolamellar hepatocellular carcinoma 3 0.0013774105
## fibromatosis 1 0.0004591368
## gastrointestinal stromal tumor 4 0.0018365473
## germ cell tumor 1 0.0004591368
## glioblastoma multiforme 29 0.0133149679
## glioma 193 0.0886134068
## hepatoblastoma 1 0.0004591368
## hepatocellular carcinoma 3 0.0013774105
## hypereosinophillic syndrome 1 0.0004591368
## juvenile myelomonocytic leukemia 1 0.0004591368
## leukemia 1 0.0004591368
## lymphoma 49 0.0224977043
## medulloblastoma 201 0.0922865014
## melanoma 7 0.0032139578
## melanotic neuroectodermal tumor 1 0.0004591368
## meningioma 1 0.0004591368
## myofibromatosis 2 0.0009182736
## nasopharyngeal carcinoma 2 0.0009182736
## neuroblastoma 12 0.0055096419
## neuroendocrine carcinoma 1 0.0004591368
## not reported 3 0.0013774105
## osteosarcoma 157 0.0720844812
## ovarian serous cystadenocarcinoma 1 0.0004591368
## PEComa 1 0.0004591368
## pleuropulmonary blastoma 1 0.0004591368
## retinoblastoma 2 0.0009182736
## rhabdoid tumor 65 0.0298438935
## rhabdomyosarcoma 53 0.0243342516
## spindle cell/sclerosing rhabdomyosarcoma 3 0.0013774105
## supratentorial embryonal tumor NOS 18 0.0082644628
## synovial sarcoma 22 0.0101010101
## undifferentiated sarcoma NOS 3 0.0013774105
## undifferentiated spindle cell sarcoma 2 0.0009182736
## wilms tumor 11 0.0050505051
disease_freq_table <- counts_and_meta$disease %>%
str_to_sentence %>%
str_replace(" nos$", " NOS") %>%
as.factor %>%
fct_infreq %>%
fct_lump(prop=0.01) %>%
tabyl (var1 = "Disease") %>%
adorn_pct_formatting(digits = 1)
colnames(disease_freq_table)[1]="Disease"
write_tsv(disease_freq_table, "results/disease_freq_table.txt")
disease_freq_table %>%
kable
| Disease | n | percent |
|---|---|---|
| Acute lymphoblastic leukemia | 680 | 31.2% |
| Acute myeloid leukemia | 221 | 10.1% |
| Medulloblastoma | 201 | 9.2% |
| Glioma | 193 | 8.9% |
| Osteosarcoma | 157 | 7.2% |
| Acute megakaryoblastic leukemia | 103 | 4.7% |
| Ependymoma | 98 | 4.5% |
| Ewing sarcoma | 70 | 3.2% |
| Rhabdoid tumor | 65 | 3.0% |
| Rhabdomyosarcoma | 53 | 2.4% |
| Lymphoma | 49 | 2.2% |
| Embryonal rhabdomyosarcoma | 42 | 1.9% |
| Alveolar rhabdomyosarcoma | 40 | 1.8% |
| Glioblastoma multiforme | 29 | 1.3% |
| Choroid plexus carcinoma | 25 | 1.1% |
| Synovial sarcoma | 22 | 1.0% |
| Other | 130 | 6.0% |
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
##
## FALSE TRUE
## 1171 1007
library(ggthemes)
cohort_size_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(n_in_cohort, group=cohort), fill = "lightgrey", color = "black") +
# theme_linedraw(base_size = 20) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) + xlab("Cohort size") +
ylab("Datasets") +
theme(legend.position="none")
cohort_size_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# for cohorts that are more than 75% one disease, color by that disease
n_distinct(counts_and_meta$cohort)
## [1] 38
sort(unique(counts_and_meta$n_in_cohort))
## [1] 3 4 6 7 10 14 15 17 19 22 23 24 25 31 35 39 41
## [18] 52 62 63 65 73 75 84 88 89 127 137 337 406
counts_and_meta %>%
select(cohort, n_in_cohort) %>%
distinct %>%
pull(n_in_cohort) %>%
median
## [1] 24.5
table(counts_and_meta$pedaya, useNA = "always")
##
## No Unknown Yes, age < 30 years
## 51 3 1557
## <NA>
## 567
table(subset(counts_and_meta, is.na(pedaya))$dataset_prefix)
## Warning: Unknown or uninitialised column: `dataset_prefix`.
## < table of extent 0 >
n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | grepl("TARGET", counts_and_meta$dataset_prefix), na.rm = TRUE)
## Warning: Unknown or uninitialised column: `dataset_prefix`.
n_ped_aya
## [1] 0
n_ped_aya/nrow(counts_and_meta)
## [1] 0
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7
median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 9
table(is.na(counts_and_meta$age_at_dx))
##
## FALSE TRUE
## 1177 1001
seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
## 25 50 75 100 125 <NA>
## 4 238 306 1358 27 245
tabyl(seq_len_round_25)
## seq_len_round_25 n percent valid_percent
## 25 4 0.001836547 0.002069322
## 50 238 0.109274564 0.123124677
## 75 306 0.140495868 0.158303156
## 100 1358 0.623507805 0.702534920
## 125 27 0.012396694 0.013967926
## NA 245 0.112488522 NA
table(is.na(counts_and_meta$seq_length))
##
## FALSE TRUE
## 1933 245
table(counts_and_meta$seq_length)
##
## 36 50 51 75 76 100 101 110 125
## 4 11 227 266 40 88 1255 15 27
summary(counts_and_meta$seq_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 75.00 101.00 90.97 101.00 125.00 245
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 26
read_length_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(seq_length, group=cohort), fill = "lightgrey", color = "black") +
# theme_linedraw(base_size = 20) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Sequence length") +
ylab("Datasets")
read_length_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 245 rows containing non-finite values (stat_bin).
subset(counts_and_meta, Mapped > Total_reads)
## # A tibble: 0 x 21
## # Groups: cohort [0]
## # … with 21 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## # Total_reads <dbl>, Mapped <dbl>, cohort <chr>, seq_length <dbl>,
## # source_accession <chr>, original_sample_id <chr>, disease <chr>,
## # sample_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>
subset(counts_and_meta, MND > Mapped)
## # A tibble: 0 x 21
## # Groups: cohort [0]
## # … with 21 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## # Total_reads <dbl>, Mapped <dbl>, cohort <chr>, seq_length <dbl>,
## # source_accession <chr>, original_sample_id <chr>, disease <chr>,
## # sample_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>
subset(counts_and_meta, MEND > MND)
## # A tibble: 0 x 21
## # Groups: cohort [0]
## # … with 21 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## # Total_reads <dbl>, Mapped <dbl>, cohort <chr>, seq_length <dbl>,
## # source_accession <chr>, original_sample_id <chr>, disease <chr>,
## # sample_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>
min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>%
ungroup %>%
mutate(dataset_value = Total_reads/1e6) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(Total_read_counts %>%
select(-variance, -p25, -p75), digits = 1)
| min | max | median | iqr |
|---|---|---|---|
| 0.2 | 668 | 61.1 | 53.3 |
total_read_count_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(Total_reads/1E6), fill = "lightgrey", color = "black") +
# theme_linedraw(base_size = 20) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Total reads") +
ylab("Datasets")
total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
read_counts_with_read_type_fractions <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(
pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
pct_dupe_of_mapped = (Mapped - MND) / Mapped,
pct_non_exonic_of_non_dupe = (MND - MEND) / MND
)
read_type_fractions_long <- read_counts_with_read_type_fractions %>%
# select(c(dataset_id, cohort_id, cohort, starts_with("pct_"))) %>%
pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value")
comparison_of_distributions <- read_type_fractions_long %>%
group_by(read_type_fraction_name) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(comparison_of_distributions %>%
mutate_if(is.double, ~100*.), digits = 3)
| read_type_fraction_name | variance | min | max | median | p25 | p75 | iqr |
|---|---|---|---|---|---|---|---|
| pct_dupe_of_mapped | 5.886 | 2.853 | 99.997 | 26.895 | 13.449 | 43.053 | 29.604 |
| pct_non_exonic_of_non_dupe | 4.094 | 4.495 | 97.000 | 24.769 | 16.197 | 37.277 | 21.080 |
| pct_unmapped_of_total | 0.606 | 0.710 | 76.677 | 3.414 | 2.740 | 6.009 | 3.269 |
kable(comparison_of_distributions %>%
select(-variance, -p25, -p75) %>%
mutate_if(is.double, ~100*.), digits = 0)
| read_type_fraction_name | min | max | median | iqr |
|---|---|---|---|---|
| pct_dupe_of_mapped | 3 | 100 | 27 | 30 |
| pct_non_exonic_of_non_dupe | 4 | 97 | 25 | 21 |
| pct_unmapped_of_total | 1 | 77 | 3 | 3 |
# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
##
## FALSE TRUE
## 2101 77
# in 77 datasets, more than 25% of reads are unmapped
read_counts_with_read_type_fractions %>%
mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
tabyl(above_50)
## above_50 n percent
## FALSE 1752 0.8044077
## TRUE 426 0.1955923
fiftypct <- read_counts_with_read_type_fractions %>%
group_by(cohort) %>%
summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)
fiftypct %>% tabyl(median_lt_50)
## median_lt_50 n percent
## FALSE 7 0.1842105
## TRUE 31 0.8157895
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
## has_a_dataset_with_more_than_50pct_dupes n percent
## FALSE 8 0.2105263
## TRUE 30 0.7894737
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
## median_lt_50_and_has_a_dataset n percent
## FALSE 15 0.3947368
## TRUE 23 0.6052632
read_counts_with_read_type_fractions %>%
mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
tabyl(above_50)
## above_50 n percent
## FALSE 1848 0.8484848
## TRUE 330 0.1515152
fiftypcte <- read_counts_with_read_type_fractions %>%
group_by(cohort) %>%
summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)
fiftypcte %>% tabyl(median_lt_50)
## median_lt_50 n percent
## FALSE 3 0.07894737
## TRUE 35 0.92105263
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
## has_a_dataset_with_more_than_50pct_ne n percent
## FALSE 26 0.6842105
## TRUE 12 0.3157895
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
## median_lt_50_and_has_a_dataset n percent
## FALSE 29 0.7631579
## TRUE 9 0.2368421
cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads,
read_counts_with_read_type_fractions$pct_dupe_of_mapped,
method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34
# p <- ggplot(read_counts_with_read_type_fractions, aes(x=Total_reads/1e6, y=pct_dupe_of_mapped)) +
# geom_point() +
# geom_smooth(method=lm, se=FALSE)
#
# # Add correlation coefficient
# p + stat_cor(method = "pearson") #, label.x = 250 * 1e6, label.y = 0.2)
# 0.64^2
this_data <- read_counts_with_read_type_fractions %>%
select(Total_reads, pct_dupe_of_mapped)
## Adding missing grouping variables: `cohort`
this_plot_title <- paste("Correlation of Total_reads and % Duplicates")
these_stats <- this_data %>%
ungroup %>%
summarize(r_corr = round(cor(Total_reads, pct_dupe_of_mapped), 2))
ggplot(this_data,
aes(x=Total_reads/1E6, y=pct_dupe_of_mapped)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black", se = FALSE) +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Total_reads/1E6),
y=max(this_data$pct_dupe_of_mapped),
hjust = 1, vjust = 1
) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
ggtitle(this_plot_title) +
theme(legend.position="none") +
xlab("Read counts (million)") +
ylab("% Duplicates")
## `geom_smooth()` using formula 'y ~ x'
colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))
colors_for_read_types <- c(colors_for_read_types,
"Genome" = "darkblue",
"Gene" = "darkblue",
"Reads" = "darkgrey")
plot1_colnames <- c("label", "x1", "x2", "y1", "y2")
read_type_schematic_data <- matrix(c("Duplicate", 3, 4, 6, 7,
"Duplicate", 3, 4, 5, 6,
"MEND", 3, 4, 4, 5,
"MEND", 2.5, 3.5, 3, 4,
"Non-exonic", 4.5, 5.5, 3, 4,
"Unmapped", 6, 7, 3, 4,
"Gene", 2.5, 4, 1, 2,
"Genome", 4.5, 5.5, 0.5, 1.5,
"Reads", 2.5, 2.5, 6, 7),
ncol = length(plot1_colnames), byrow = TRUE,
dimnames = list(c(1:9), plot1_colnames)) %>%
as_tibble %>%
type_convert() %>%
mutate(
base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
border_color = case_when(
label %in% c("Reads", "Genome", "MEND", "Gene") ~ NA_character_,
TRUE ~ base_color),
fill_color = ifelse(label %in% c("MEND", "Gene"), base_color, "white"),
text_color = case_when(
label %in% c("MEND", "Gene")~ "white",
TRUE ~ base_color),
mid_bar_x = map2_dbl(x1, x2, function(x, y) mean(c(x,y))),
mid_bar_y = map2_dbl(y1, y2, function(x, y) mean(c(x,y)))
)
## Parsed with column specification:
## cols(
## label = col_character(),
## x1 = col_double(),
## x2 = col_double(),
## y1 = col_double(),
## y2 = col_double()
## )
padding_size = 0.35
read_type_schematic <- ggplot(read_type_schematic_data,
aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2,
fill = fill_color, color = border_color)) +
geom_rect() +
geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +
geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
color = text_color),
size = 4,
fontface = "bold") +
geom_rect(aes(xmin = min(x1 - padding_size),
ymin = 2.5,
xmax = max(x2 + padding_size),
ymax=max(y2 + padding_size)),
fill = NA, color = "darkgrey", size = 0.25) +
scale_fill_identity() +
scale_color_identity() +
theme_void()
read_type_schematic
stat_names <- c("pct_unmapped_of_total", "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
# complement_stat_names <- c("pct_mapped_of_total", "pct_nondupe_of_mapped", "pct_exonic_of_non_dupe")
stat_labels <- c("Unmapped", "Duplicate", "Non-exonic")
complement_stat_names <- c("Mapped reads", "Non-duplicate reads", "Exonic reads")
divisor_name = c("total", "mapped", "non-duplicate")
# names(divisor_name) =
comparison_of_remaining <- read_type_fractions_long %>%
mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
group_by(read_type_fraction_name) %>%
summarize(min = round(1-max(dataset_value), 2),
max = round(1-min(dataset_value), 2),
median = round(1-median(dataset_value), 2),
p25 = round(1-quantile(dataset_value, 0.25), 2),
p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x=", median, "%)"))
positive_bars <- comparison_of_remaining %>%
select(-min, -max, -p25, -p75) %>%
mutate(read_type_fraction_label = read_type_fraction_name,
position = "1") %>%
rename(abs_median = median)
negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
bar_label = read_type_fraction_label,
abs_median = 1-positive_bars$abs_median,
position = "2")
total_bar <- tibble(read_type_fraction_name = "Total reads",
read_type_fraction_label = read_type_fraction_name,
bar_label = "Total reads\n(100% of reads)",
abs_median = 1,
position = "1",
#category_space = 1
)
MEND_stats_of_total <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(
pct_MEND_of_total = (Total_reads - MEND) / Total_reads
) %>%
ungroup %>%
summarize(min = round(1-max(pct_MEND_of_total), 2),
max = round(1-min(pct_MEND_of_total), 2),
median = round(1-median(pct_MEND_of_total), 2))
MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
read_type_fraction_label = read_type_fraction_name,
bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; x=", median, "%)")),
abs_median = 1,
position = "1",
#category_space = 1
)
bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
mutate(read_type_fraction_name = factor(read_type_fraction_name,
levels = bar_names_in_order))
cat_space <- all_bars %>% filter(position == 1) %>%
arrange(read_type_fraction_name) %>%
select(read_type_fraction_name, abs_median) %>%
ungroup %>%
mutate(lagged1_abs_median = lag(abs_median, default =1),
lagged2_abs_median = lag(abs_median, n=2, default =1),
category_space = lagged1_abs_median*lagged2_abs_median
) %>%
select(read_type_fraction_name, category_space)
these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")
all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
abs_median[read_type_fraction_label=="Exonic reads"]*
category_space[read_type_fraction_label=="Exonic reads"],
abs_median*category_space),
read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
position = factor(position, levels = rev(sort(unique(position)))),
bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
fill_val = ifelse(position == 1, bar_color, NA),
# border_color_val = ifelse(position ==1, NA, bar_color),
border_color_val = bar_color,
text_color = ifelse(position ==1, "white", "black"),
)
read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
x=rel_median, group = position)) +
geom_col(aes(fill = fill_val,
color = border_color_val
)) +
geom_text(aes(label=bar_label,
color = text_color),
position = position_stack(vjust = .5),
# size = 4.5,
size = 3.5,
fontface = "bold") +
scale_fill_identity() +
scale_color_identity() +
theme_void() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggplot(all_bars_rel, aes(y=read_type_fraction_name,
x=rel_median, group = position)) +
geom_col(aes(fill = fill_val,
color = border_color_val
)) +
geom_text(aes(label=read_type_fraction_label,
color = text_color),
position = position_stack(vjust = .5),
size = 4.5,
fontface = "bold") +
scale_fill_identity() +
scale_color_identity() +
theme_void() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
read_type_fractions_schematic
figure_name <- "f1"
plot_grid(cohort_size_histogram,
read_type_schematic,
read_length_histogram,
read_type_fractions_schematic,
nrow=2,
labels = c("A", "C", "B", "D")) +
draw_label(paste(figure_name, Sys.time()),
x = 0, y = 0.02, hjust = 0, size = 6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 245 rows containing non-finite values (stat_bin).
stat_names <- c("pct_unmapped_of_total", "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
names(stat_label) <- stat_names
colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names
these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <- c("not assigned", "Total reads", "Non-exonic reads", "Exonic reads", "Duplicate reads", "Non-duplicate reads", "Unmapped reads", "Mapped reads")
# 6 x 11 was a good ratio, but the text was too small
read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
mutate(read_type_fraction_name = factor(read_type_fraction_name,
levels = stat_names))
# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
"StatBin2",
StatBin,
compute_group = function (data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, origin = NULL, right = NULL,
drop = NULL, width = NULL) {
if (!is.null(breaks)) {
if (!scales$x$is_discrete()) {
breaks <- scales$x$transform(breaks)
}
bins <- ggplot2:::bin_breaks(breaks, closed)
}
else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data$x)
}
bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth,
center = center, boundary = boundary,
closed = closed)
}
else {
bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins,
center = center, boundary = boundary,
closed = closed)
}
res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)
# drop 0-count bins completely before returning the dataframe
res <- res[res$count > 0, ]
res
})
read_type_fractions_histogram <- ggplot(read_type_fractions_long_for_histogram) +
# geom_histogram(aes(x = dataset_value, fill = read_type_fraction_name)) +
geom_histogram(aes(x = dataset_value, color = read_type_fraction_name),
fill = "white", stat = StatBin2) +
# scale_fill_brewer(palette = "Set1") +
scale_color_manual(values = colors_for_stat_names) +
facet_wrap(~read_type_fraction_name,
nrow = 1,
scales = "free_y",
labeller = labeller(
read_type_fraction_name = stat_label
)
) +
ylab("Datasets") +
xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)), " datasets")) +
scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
# expand_limits(x = c(-0.25, 0.9)) +
# ggtitle(plot_title) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
# theme_linedraw(base_size = 20) +
theme(strip.text.x = element_text(size = 12, face = "bold")) +
theme(legend.position = "none")
read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# 75% of datasets have fewer than 7% unmapped reads
read_counts_with_read_type_fractions
## # A tibble: 2,178 x 24
## # Groups: cohort [38]
## dataset_id MND MEND Total_reads Mapped cohort seq_length
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 TARGET-21… 1.75e8 7.59e7 667997032 4.59e8 TARGE… 75
## 2 THR33_099… 3.02e8 1.70e8 549163266 5.13e8 EGAD0… 51
## 3 TARGET-21… 1.22e8 6.77e7 532566307 3.58e8 TARGE… 75
## 4 TARGET-21… 7.91e7 4.76e7 475949820 3.01e8 TARGE… 75
## 5 TARGET-21… 8.02e7 5.81e7 466002909 2.85e8 TARGE… 75
## 6 TARGET-21… 7.45e7 3.59e7 465622167 2.16e8 TARGE… 75
## 7 TARGET-21… 7.30e7 4.37e7 463732470 2.59e8 TARGE… 75
## 8 TARGET-21… 1.02e8 5.33e7 461425978 2.66e8 TARGE… 75
## 9 TARGET-21… 8.85e7 4.64e7 461393140 2.78e8 TARGE… 75
## 10 TARGET-21… 7.31e7 4.07e7 461017023 2.48e8 TARGE… 75
## # … with 2,168 more rows, and 17 more variables: source_accession <chr>,
## # original_sample_id <chr>, disease <chr>, sample_prefix <chr>,
## # age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>, pct_unmapped_of_total <dbl>,
## # pct_dupe_of_mapped <dbl>, pct_non_exonic_of_non_dupe <dbl>
comparison_of_distributions
## # A tibble: 3 x 8
## read_type_fraction_na… variance min max median p25 p75 iqr
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 pct_dupe_of_mapped 0.0589 0.0285 1.00 0.269 0.134 0.431 0.296
## 2 pct_non_exonic_of_non… 0.0409 0.0450 0.970 0.248 0.162 0.373 0.211
## 3 pct_unmapped_of_total 0.00606 0.00710 0.767 0.0341 0.0274 0.0601 0.0327
read_counts_with_MEND_fractions <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(pct_MEND_of_total = MEND /Total_reads)
ggplot(read_counts_with_MEND_fractions) +
geom_histogram(aes(pct_MEND_of_total)) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
MEND_pct <- read_counts_with_MEND_fractions %>%
ungroup %>%
mutate(dataset_value = pct_MEND_of_total) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(MEND_pct %>%
select(-variance, -p25, -p75) %>%
mutate_if(is.double, ~100*.), digits = 0)
| min | max | median | iqr |
|---|---|---|---|
| 0 | 79 | 50 | 31 |
figure_name <- "fracs_by_group"
read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
ungroup %>%
mutate(boxplot_label = fct_reorder(paste0(as.character(cohort), ", n=", n_in_cohort), n_in_cohort))
read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5) +
facet_wrap(~read_type_fraction_name,
nrow = 1,
labeller = labeller(
read_type_fraction_name = stat_label
)
) +
ylab("Cohorts") +
xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)), " datasets")) +
scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.x = element_text(size = 12, face = "bold")) +
theme(legend.position = "none")
read_type_fractions_per_cohort_boxplot
#
# f2ab <- plot_grid(read_type_fractions_histogram +
# theme(strip.text.x = element_text(size = 10, face = "bold"),
# axis.title.x=element_blank(),
# axis.text.x=element_blank()),
# read_type_fractions_per_cohort_boxplot +
# theme(strip.text.x = element_blank(),
# strip.background = element_blank(),
# strip.text = element_blank()),
# ncol = 1,
# rel_heights = c(1.5, 6),
# labels = "AUTO",
# align = "v")
Northcott, P.A., Buchhalter, I., Morrissy, A.S., Hovestadt, V., Weischenfeldt, J., Ehrenberger, T., Gröbner, S., Segura-Wang, M., Zichner, T., Rudneva, V.A., et al [Lichter]. (2017). The whole-genome landscape of medulloblastoma subtypes. Nature 547, 311–317.
two of the senior authors are from the German Cancer Research Center (DKFZ), one is Roland Eils German Cancer Research Center (DKFZ) Marco A. Marra from BC, Stefan M. Pfister German Cancer Research Center (DKFZ) Michael D. Taylor Sick kids Peter Lichter German Cancer Research Center (DKFZ)
RNA-Seq usage 1) CESAM integrates structural variant-derived breakpoints with RNA-seq data to identify expression changes associated with breakpoints in cis as described in previously37, by performing linear regression of expression (molecular phenotype) on structural variant-derived breakpoint (somatic genotype) data.
Transcriptome data were acquired through RNA sequencing (RNA-seq; n = 164) and Affymetrix expression arrays (n = 392).
https://www.nature.com/articles/nature22973
read_type_names <- c("Total_reads", "Mapped", "MND", "MEND")
this_cohort <- "EGAD00001003279"
read_counts_with_read_type_fractions %>%
filter(cohort == this_cohort) %>%
mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
tabyl(gt_98) %>%
add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
## gt_98 n percent
## FALSE 55 0.4330709
## TRUE 72 0.5669291
## Total 127 1.0000000
read_counts_with_read_type_fractions %>%
filter(cohort == this_cohort) %>%
filter(pct_dupe_of_mapped > 0.98) %>%
pull(Total_reads) %>% min
## [1] 178294341
counts_and_meta_long <- counts_and_meta %>%
pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value") %>%
ungroup %>%
mutate(read_type_name = factor(read_type_name, levels = read_type_names))
# read_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
# names(read_label) <- read_type_names
counts_and_meta_long_one_proj <- counts_and_meta_long %>%
filter(cohort == this_cohort)
table(counts_and_meta_long_one_proj$read_type_name)
##
## Total_reads Mapped MND MEND
## 127 127 127 127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads.
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)
summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include,
max_plots_to_include,
n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))
set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)
ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) +
geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
facet_wrap(~dataset_id, ncol = 1, strip.position="right") +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.y = element_text(angle = 0)) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort)) +
coord_flip()
ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) +
geom_histogram(aes(dataset_value/1e6)) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
ggtitle(paste("Distribution of MEND counts in", this_cohort))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)
ggplot(counts_and_meta %>%
filter(cohort == this_cohort) %>%
mutate(pct_dupes = (Mapped - MND)/Mapped)) +
geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
# scale_color_gradientn() +
scale_fill_gradientn(colours = c(bl,"white", re)) +
ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")
counts_and_meta_long_one_proj %>%
select(seq_length, dataset_id) %>%
distinct %>%
tabyl(seq_length)
## seq_length n percent
## 51 127 1
expressed_gene_counts_long <- counts_and_meta %>%
pivot_longer(starts_with("genes_expressed_above")) %>%
mutate(min_gene_expr_tpm = gsub("genes_expressed_above_", "", name))
ggplot (expressed_gene_counts_long) +
geom_histogram(aes(x=value)) +
facet_wrap(~name)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
sort(counts_and_meta$genes_expressed_above_0) [1:100]
## [1] 9 9 12 12 13 14 14 15 16 16 17
## [12] 18 18 18 18 18 18 19 19 19 19 20
## [23] 20 20 20 20 20 21 21 21 21 21 21
## [34] 22 22 22 23 23 23 23 23 24 24 24
## [45] 24 24 24 24 25 25 25 26 26 26 26
## [56] 27 27 27 27 28 28 28 28 29 29 29
## [67] 29 29 30 30 31 31 32 33 35 36 37
## [78] 40 6098 7107 11830 15323 16793 16837 16943 17233 17531 18288
## [89] 18588 18704 19626 19693 19749 19781 19942 19963 20063 20120 20203
## [100] 20247
library(corrr)
counts_and_meta %>%
ungroup %>%
select_if(is.double) %>%
correlate() %>%
filter(rowname %in% read_type_names) %>%
rename(Read_type_count=rowname) %>%
mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
arrange(Read_type_count) %>%
select(c(Read_type_count, starts_with("genes_expressed"))) %>%
kable(caption = "Correlations with all datasets", digits = 2)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
| Read_type_count | genes_expressed_above_0 | genes_expressed_above_1 | genes_expressed_above_2 | genes_expressed_above_3 | genes_expressed_above_4 | genes_expressed_above_5 |
|---|---|---|---|---|---|---|
| Total_reads | -0.20 | -0.40 | -0.40 | -0.40 | -0.39 | -0.39 |
| Mapped | -0.19 | -0.41 | -0.41 | -0.41 | -0.40 | -0.39 |
| MND | 0.51 | 0.24 | 0.20 | 0.18 | 0.17 | 0.16 |
| MEND | 0.48 | 0.27 | 0.26 | 0.26 | 0.26 | 0.26 |
counts_and_meta %>%
filter(genes_expressed_above_0 > min_genes_gt_0) %>%
ungroup %>%
select_if(is.double) %>%
correlate() %>%
filter(rowname %in% read_type_names) %>%
rename(Read_type_count=rowname) %>%
mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
arrange(Read_type_count) %>%
select(c(Read_type_count, starts_with("genes_expressed"))) %>%
kable(caption = "Correlations with all but low gene count datasets", digits = 2)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
| Read_type_count | genes_expressed_above_0 | genes_expressed_above_1 | genes_expressed_above_2 | genes_expressed_above_3 | genes_expressed_above_4 | genes_expressed_above_5 |
|---|---|---|---|---|---|---|
| Total_reads | 0.13 | -0.26 | -0.28 | -0.28 | -0.28 | -0.28 |
| Mapped | 0.21 | -0.25 | -0.26 | -0.27 | -0.27 | -0.27 |
| MND | 0.51 | 0.04 | 0.02 | 0.00 | 0.00 | 0.00 |
| MEND | 0.44 | 0.08 | 0.09 | 0.10 | 0.11 | 0.12 |
expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
mutate(Read_type = factor(Read_type, levels = read_type_names))
datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
filter(genes_expressed_above_0 > min_genes_gt_0) %>%
pull(dataset_id)
datasets_with_outlier_gene_counts <- counts_and_meta %>%
ungroup %>%
filter(is_outlier(genes_expressed_above_0)) %>%
pull(dataset_id)
datasets_with_outlier_depths <- counts_and_meta %>%
ungroup %>%
filter(is_outlier(Total_reads)) %>%
pull(dataset_id)
plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){
these_stats <- this_data %>%
group_by(Read_type, Expression_threshold) %>%
summarize(r_corr = round(cor(Gene_count, Read_count), 2))
# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
"MND"="#E31A1C", "MEND"="#000000")
ggplot(this_data,
aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black") +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Read_count/1E6),
y=max(this_data$Gene_count/1e3),
hjust = 1, vjust = 1
) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
scale_color_manual(values = colors_for_correlation_plot) +
facet_grid(Read_type~Expression_threshold) +
ggtitle(this_plot_title) +
theme(legend.position="none") +
xlab("Read counts (million)") +
ylab("Gene counts (thousand)")
}
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot,
"Correlations calculated across all datasets")
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
paste("Correlations in datasets with more than", min_genes_gt_0, "genes expressed above 0"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(! dataset_id %in% datasets_with_outlier_gene_counts),
paste("Correlations calculated across all but outlier gene count datasets"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(! dataset_id %in% datasets_with_outlier_depths),
paste("Correlations calculated across all but outlier read depth datasets"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(! dataset_id %in% datasets_with_outlier_depths,
! dataset_id %in% datasets_with_outlier_gene_counts
),
paste("Correlations calculated across all but datasets with outlier read depth or gene count"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(
expr_gene_and_read_counts_to_plot %>%
filter(disease == "acute lymphoblastic leukemia"),
"Correlations calculated among acute lymphoblastic leukemia datasets")
## `geom_smooth()` using formula 'y ~ x'
max_total_read_depth <- 250*1e6
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
filter(Total_reads > max_total_read_depth) %>%
pull(dataset_id)
exclude_for_gene_count <- counts_and_meta %>%
filter(genes_expressed_above_0 < min_genes_gt_0) %>%
pull(dataset_id)
this_data <- expr_gene_and_read_counts_to_plot %>%
filter(
Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
! dataset_id %in% exclude_for_depth,
! dataset_id %in% exclude_for_gene_count
) %>%
mutate(Expression_threshold = str_replace(Expression_threshold, "genes_expressed_above_", "genes above "))
n_distinct(this_data$dataset_id)
## [1] 1995
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")
these_stats <- this_data %>%
group_by(Read_type, Expression_threshold) %>%
summarize(r_corr = round(cor(Gene_count, Read_count), 2))
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
"MND"="#E31A1C", "MEND"="#000000")
cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black") +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Read_count/1E6),
y=max(this_data$Gene_count/1e3),
hjust = 1, vjust = 1
) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5),
strip.text = element_text(face = "bold")) +
scale_color_manual(values = colors_for_correlation_plot) +
facet_grid(Read_type~Expression_threshold) +
# ggtitle(this_plot_title) +
theme(legend.position="none") +
scale_x_continuous("Read counts (million)", n.breaks = 4) +
ylab("Gene counts (thousand)")
cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'
# this_data <- expr_gene_and_read_counts_to_plot %>%
# filter(
# Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
# # ! dataset_id %in% datasets_with_outlier_depths,
# # ! dataset_id %in% datasets_with_outlier_gene_counts
# ) %>%
# mutate(Expression_threshold = str_replace(Expression_threshold, "genes_expressed_above_", "genes above "))
#
# this_plot_title <- paste("Correlations calculated across all but datasets with outlier read depth or gene count")
#
# these_stats <- this_data %>%
# group_by(Read_type, Expression_threshold) %>%
# summarize(r_corr = round(cor(Gene_count, Read_count), 2))
#
# # unique(this_data$Read_type)
# # dput(unique(this_data$Read_type))
# # these_colors_with_MEND
#
# colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
# "MND"="#E31A1C", "MEND"="#000000")
#
# cor_plots_excluding_read_depth_and_gene_count_outliers <- ggplot(this_data,
# aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
# geom_point(alpha = 0.5, shape = 20, size =0.1) +
# geom_smooth(method = "lm", color = "black") +
# geom_label(data = these_stats,
# aes(label=paste0("r=", r_corr)),
# x=max(this_data$Read_count/1E6),
# y=max(this_data$Gene_count/1e3),
# hjust = 1, vjust = 1
# ) +
# theme_linedraw() +
# scale_color_manual(values = colors_for_correlation_plot) +
# facet_grid(Read_type~Expression_threshold) +
# # ggtitle(this_plot_title) +
# theme(legend.position="none") +
# xlab("Read counts (million)") +
# ylab("Gene counts (thousand)")
f2a <- read_type_fractions_histogram +
theme(strip.text.x = element_text(size = 10, face = "bold"),
axis.title.x=element_blank(),
axis.text.x=element_blank())
f2b <- read_type_fractions_per_cohort_boxplot +
theme(strip.text.x = element_blank(),
strip.background = element_blank(),
strip.text = element_blank())
f2ab <- plot_grid(f2a,
f2b,
ncol = 1,
rel_heights = c(1.5, 6),
align = "v",
labels = "AUTO")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f2c <- plot_grid(cor_plots_excluding_high_read_depth_and_low_gene_count_datasets,
labels = "C")
## `geom_smooth()` using formula 'y ~ x'
f2 <- plot_grid(f2ab,
f2c,
nrow=1,
rel_widths = c(4,2.3))
figure_name <- "f2"
f2 + draw_label(paste(figure_name, Sys.time()),
x = 0, y = 0.02, hjust = 0, size = 6)
readDist_data_raw <- read_tsv("readDist/readDist_groomed.2020_06_09.txt")
## Parsed with column specification:
## cols(
## Measurement = col_character(),
## Count = col_double(),
## Measurement_type = col_character(),
## source_file = col_character()
## )
readDist_data <- readDist_data_raw %>%
rename(dataset_id = source_file) %>%
unite("new_col_name", c(Measurement, Measurement_type)) %>%
pivot_wider(names_from = new_col_name, values_from = Count) %>%
mutate(
Total_exonic_tag_count = `CDS_Exons_Tag count` + `5'UTR_Exons_Tag count` + `3'UTR_Exons_Tag count`,
Total_genic_tag_count = `Introns_Tag count` + Total_exonic_tag_count,
gene_adjacent_tag_count=Total_genic_tag_count + `TSS_up_10kb_Tag count` + `TES_down_10kb_Tag count`)
expr_gene_and_sub_read_counts <- counts_and_meta %>%
left_join(readDist_data, by=c("dataset_id"))
plot_cors <- function(cors_input=expr_gene_and_sub_read_counts, plot_title="Correlations"){
cors <- cors_input %>%
ungroup %>%
select_if(is.double) %>%
correlate()
# fashion(cors) %>% View()
cors_tile <- cors %>%
rename(Measure=rowname) %>%
filter (! str_detect(Measure, "^genes_expressed.*")) %>%
select(c(Measure, starts_with("genes_expressed"))) %>%
pivot_longer(-Measure) %>%
group_by(Measure) %>%
mutate(
measure_cor_with_genes_expressed_above_0 = value[name == "genes_expressed_above_0"],
measure_cor_with_genes_expressed_above_1 = value[name == "genes_expressed_above_1"],
measure_cor_with_genes_expressed_above_2 = value[name == "genes_expressed_above_2"],
measure_cor_with_genes_expressed_above_3 = value[name == "genes_expressed_above_3"]) %>%
ungroup
p0<-ggplot(cors_tile %>%
mutate(Measure = fct_reorder(factor(Measure), measure_cor_with_genes_expressed_above_0)),
aes(y=Measure, x=name, fill = value)) +
geom_tile() +
geom_text(aes(label=round(value, 2))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste(plot_title, "\nordered by correlation with count of genes expressed above 0"))
p1<-ggplot(cors_tile %>%
mutate(Measure = fct_reorder(factor(Measure), measure_cor_with_genes_expressed_above_1)),
aes(y=Measure, x=name, fill = value)) +
geom_tile() +
geom_text(aes(label=round(value, 2))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste(plot_title, "\nordered by correlation with count of genes expressed above 1"))
p2<-ggplot(cors_tile %>%
mutate(Measure = fct_reorder(factor(Measure), measure_cor_with_genes_expressed_above_2)),
aes(y=Measure, x=name, fill = value)) +
geom_tile() +
geom_text(aes(label=round(value, 2))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste(plot_title, "\nordered by correlation with count of genes expressed above 2"))
p3<-ggplot(cors_tile %>%
mutate(Measure = fct_reorder(factor(Measure), measure_cor_with_genes_expressed_above_3)),
aes(y=Measure, x=name, fill = value)) +
geom_tile() +
geom_text(aes(label=round(value, 2))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste(plot_title, "\nordered by correlation with count of genes expressed above 3"))
return(list(p0, p1, p2, p3))
}
plot_cors(expr_gene_and_sub_read_counts, "All datasets")
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
## Subsets
plot_cors(expr_gene_and_sub_read_counts%>%
filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
paste("datasets with more than", min_genes_gt_0, "genes expressed above 0"))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
plot_cors(expr_gene_and_sub_read_counts%>%
filter(! dataset_id %in% datasets_with_outlier_gene_counts),
paste("datasets with non-outlier gene counts"))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
plot_cors(expr_gene_and_sub_read_counts%>%
filter(! dataset_id %in% datasets_with_outlier_depths,
! dataset_id %in% datasets_with_outlier_gene_counts
),
paste("datasets with non-outlier gene counts and non outlier depths"))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
plot_cors(expr_gene_and_sub_read_counts%>%
filter(disease == "acute lymphoblastic leukemia"),
"All acute lymphoblastic leukemia datasets")
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
ggplot(read_counts_with_read_type_fractions) +
geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))
## Warning: Removed 245 rows containing missing values (geom_point).
ggplot(read_counts_with_read_type_fractions) +
geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))
## Warning: Removed 245 rows containing missing values (stat_boxplot).
cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads,
read_counts_with_read_type_fractions$pct_dupe_of_mapped,
method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34